For this guiding question, I look only at 2024 data. Below, you’ll see on the each pilot practice (what schools hope to implement in the next 1-5 years) on the left and catalyst for innovation, or the reason(s) school leaders cite for innovating, across the top.
# Correlations between catalyst variables and reasons for innovating
catalysts <- full %>%
select(starts_with("catalyst_")) %>%
select(-starts_with("catalyst_key"), -contains("_other"))
pilots <- full %>%
select(starts_with("pilot_")) %>%
select(-contains("_other")) remove_zero_variance <- function(df) {
# Apply function to each column
non_constant_cols <- df[, apply(df, 2, sd) != 0]
return(non_constant_cols)
}
catalysts_clean <- remove_zero_variance(catalysts)
pilots_clean <- remove_zero_variance(pilots) #just one removed
reasons_and_futures <- cor(pilots_clean, catalysts_clean)
rfdf <- data.frame(reasons_and_futures)corrplot::corrplot(reasons_and_futures, method = "color", tl.col = "black", tl.cex = 0.3, c1.cex = 0.3, number.cex = 0.2, diag = TRUE)Kind of clunky. I’m going to modify to fix the labels manually and then expand the figure to better see the correlations.
# rename catalysts
rename_catalyst_mapping <- setNames(
dictionary$clean_labels[dictionary$variable_name %in% colnames(catalysts)],
dictionary$variable_name[dictionary$variable_name %in% colnames(catalysts)]
)
# rename pilots
rename_pilot_mapping <- setNames(
dictionary$clean_labels[dictionary$variable_name %in% colnames(pilots)],
dictionary$variable_name[dictionary$variable_name %in% colnames(pilots)]
)
# Correlations between catalyst variables and reasons for innovating
catalysts <- full %>%
select(starts_with("catalyst_")) %>%
select(-starts_with("catalyst_key"), -contains("_other")) %>%
rename_with(~ rename_catalyst_mapping[.x], .cols = everything())
pilots <- full %>%
select(starts_with("pilot_")) %>%
select(-contains("_other")) %>%
rename_with(~ rename_pilot_mapping[.x], .cols = everything())
catalysts_clean <- remove_zero_variance(catalysts)
pilots_clean <- remove_zero_variance(pilots) #just one removed
reasons_and_futures <- cor(pilots_clean, catalysts_clean)
rfdf <- data.frame(reasons_and_futures)corrplot::corrplot(reasons_and_futures, method = "color", tl.col = "black", tl.cex = 0.6, c1.cex = 0.3, number.cex = 0.2, diag = TRUE, cl.pos = "n")The following are related to each catalyst. Correlations above 0.15 are noted.
catalyst_all_years <- import(here("data/longitudinal", "longitudinal_data.csv")) %>%
select(year, school_id, starts_with("catalyst")) %>%
filter(year == 2021 | year == 2024)
catalyst_all_years_long <- catalyst_all_years %>%
select(-contains("_other"), -contains("_key")) %>%
pivot_longer(cols = contains("catalyst"),
names_to = "catalyst",
values_to = "selected",
names_prefix = "catalyst_")From Cycle 2, I had started to create this graph. This shows catalyst selection across schools.
library(ggrepel)
catalyst_all_years_long <- catalyst_all_years_long %>%
group_by(catalyst, year) %>%
summarize(total_selected = sum(selected))
label_positions <- catalyst_all_years_long %>%
group_by(catalyst) %>%
summarize(year = 2021, selected = first(total_selected))
catalyst_all_years_long %>%
ggplot(aes(x = year, y = total_selected, color = catalyst)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
geom_label_repel(data = label_positions, aes(y = selected, label = catalyst)) +
scale_x_continuous(breaks = c(2021, 2024)) +
labs(x = "Year",
y = "Number of Times Selected",
title = "Catalyst Selection by Year \n Across Schools") +
theme(legend.position = "none")Version using percentages is added here.
n_2021 = 232
n_2024 = 189
catalyst_all_years_long <- catalyst_all_years_long %>%
mutate(pct = case_when(year == 2021 ~ total_selected/n_2021,
year == 2024 ~ total_selected/n_2024))
label_positions <- catalyst_all_years_long %>%
group_by(catalyst) %>%
summarize(year = 2021, pct = first(pct))
catalyst_all_years_long %>%
ggplot(aes(x = year, y = pct, color = catalyst)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
geom_label_repel(data = label_positions, aes(y = pct, label = catalyst)) +
scale_x_continuous(breaks = c(2021, 2024)) +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = "Year",
y = "Percent of Times Selected",
title = "Catalyst Selection by Year \n Across Schools") +
theme(legend.position = "none")What about schools who responded to the survey both years? So, looking within schools? Let’s narrow the sample and check that out.
catalyst_all_years_within <- catalyst_all_years %>%
filter(duplicated(school_id) | duplicated(school_id, fromLast = TRUE)) %>%
select(-contains("_other"), -contains("_key")) %>%
pivot_longer(cols = contains("catalyst"),
names_to = "catalyst",
values_to = "selected",
names_prefix = "catalyst_") %>%
group_by(catalyst, year) %>%
summarize(total_selected = sum(selected))
label_positions <- catalyst_all_years_within %>%
group_by(catalyst) %>%
summarize(year = 2021, selected = first(total_selected))
catalyst_all_years_within %>%
ggplot(aes(x = year, y = total_selected, color = catalyst)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
geom_label_repel(data = label_positions, aes(y = selected, label = catalyst)) +
scale_x_continuous(breaks = c(2021, 2024)) +
labs(x = "Year",
y = "Number of Times Selected",
title = "Catalyst Selection by Year \n Within Schools") +
theme(legend.position = "none")Now, in percentage. The total value here will be different than in
the across graph since not ever school answered every year.
Only 82 did.
n_within = 82
catalyst_all_years_within <- catalyst_all_years_within %>%
mutate(pct = total_selected/n_within)
label_positions <- catalyst_all_years_within %>%
group_by(catalyst) %>%
summarize(year = 2021, pct = first(pct))
catalyst_all_years_within %>%
ggplot(aes(x = year, y = pct, color = catalyst)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
geom_label_repel(data = label_positions, aes(y = pct, label = catalyst)) +
scale_x_continuous(breaks = c(2021, 2024)) +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = "Year",
y = "Percent of Times Selected",
title = "Catalyst Selection by Year \n Within Schools") +
theme(legend.position = "none")Create a version similar to the most added practices figures.
catalyst_all_years_within_change <- catalyst_all_years_within %>%
pivot_wider(names_from = year,
values_from = c(total_selected, pct))
catalyst_all_years_within_change %>%
ggplot(aes(x = total_selected_2021, xend = total_selected_2024, y = reorder(catalyst, total_selected_2024), yend = catalyst)) +
geom_segment(color = "black", linetype = "dotted") +
geom_point(aes(x = total_selected_2021), color = "red") +
geom_point(aes(x = total_selected_2024), color = "blue") +
guides(col = guide_legend(nrow = 1, title = NULL)) +
bar_x_scale_count +
geom_text(
aes(x = (total_selected_2021 + total_selected_2024)/2, label = paste("Δ =", total_selected_2024 - total_selected_2021), color = factor(sign(total_selected_2024 - total_selected_2021))),
nudge_y = .3,
hjust = 0,
show.legend = FALSE
) +
scale_color_manual(
values = c("red", "blue"),
labels = c("2021", "2024")
) +
labs(
y = "Catalysts",
x = "Times Selected",
title = "Catalyst Selection from 2021 to 2024 Within Schools"
) +
theme(
panel.grid.major.y = element_blank(),
legend.position = "bottom",
axis.text.y = element_text(size = rel(0.6))
)How many schools selected just one catalyst in particular?
one_cat <- catalyst_all_years %>%
select(-contains("_other"), -contains("_key")) %>%
mutate(cat_select = rowSums(across(3:11))) %>%
filter(cat_select == 1) %>%
pivot_longer(cols = contains("catalyst"),
names_to = "catalyst",
values_to = "selected",
names_prefix = "catalyst_") %>%
filter(selected == 1) %>%
group_by(year, catalyst) %>%
summarise(count = n())one_cat %>%
ggplot(aes(x = count, y = catalyst, fill = as.factor(year))) +
geom_col(position = "dodge") +
scale_fill_manual(values = transcend_cols) +
labs(title = "Solo-Select Catalysts",
x = "Catalyst",
y = "Count",
legend = "Year")Also going to represent this information here in a change plot. Note, this is not going to be within schools since there is only once school that selected one catalyst each year. Also note, covid and student_agency were exclusive to 2021.
one_cat %>%
pivot_wider(names_from = "year",
values_from = "count") %>%
ggplot(aes(x = `2021`, xend = `2024`, y = reorder(catalyst, `2024`), yend = catalyst)) +
geom_segment(color = "black", linetype = "dotted") +
geom_point(aes(x = `2021`), color = "red") +
geom_point(aes(x = `2024`), color = "blue") +
geom_point(x = 1, y = "internal", color = "purple") +
geom_point(x = 1, y = "external", color = "purple") +
guides(col = guide_legend(nrow = 1, title = NULL)) +
# bar_x_scale_count +
geom_text(
aes(x = (`2021` + `2024`)/2 -1, label = paste("Δ =", `2024` - `2021`), color = factor(sign(`2024` - `2021`))),
nudge_y = .3,
hjust = 0,
show.legend = FALSE
) +
labs(
y = "Catalysts",
x = "Times Selected",
title = "Solo-Select Catalyst Selection \nfrom 2021 to 2024 Across Schools"
) +
theme(
panel.grid.major.y = element_blank(),
legend.position = "bottom",
axis.text.y = element_text(size = rel(0.6))
)cat_by_year <- catalyst_all_years %>%
select(year, school_id, starts_with("catalyst_key"), -contains("_other")) %>%
pivot_longer(cols = contains("catalyst"),
names_to = "catalyst_key",
values_to = "selected",
names_prefix = "catalyst_key_") %>%
pivot_wider(names_from = year,
values_from = selected) %>%
na.omit() %>% #omit schools that only answered one year
mutate(selected = case_when(`2021` == 0 & `2024` == 0 ~ "neither year",
`2021` == 0 & `2024` == 1 ~ "added",
`2021` == 1 & `2024` == 0 ~ "dropped",
`2021` == 1 & `2024` == 1 ~ "both years")) %>%
group_by(catalyst_key, selected) %>%
summarise(n = n())cat_by_year %>%
ggplot(aes(x = n, y = catalyst_key, fill = selected)) +
geom_col(position = "dodge") +
scale_fill_manual(values = transcend_cols) +
labs(title = "Catalyst Key Selection for Schools with Both Years of Data",
x = "Number of Schools Selecting",
y = "Catalyst",
legend = "Status")This is from Cycle 2. What if we looked at schools who only selected one catalyst?
one_cat_change <- catalyst_all_years %>%
select(-contains("_other"), -contains("_key")) %>%
mutate(cat_select = rowSums(across(3:11))) %>%
filter(cat_select == 1) %>%
pivot_longer(cols = contains("catalyst"),
names_to = "catalyst",
values_to = "selected",
names_prefix = "catalyst_") %>%
filter(selected == 1)Looks like there was just one school who meets this criteria. This school went from focusing on inequity to demographics.
I’m continuing to explore this question, this time with a vertical barchart of adds/drops only. Note again, these are schools who responded both years.
cat_by_year %>%
pivot_wider(names_from = "selected",
values_from = "n") %>%
mutate(dropped = -1*dropped) %>%
ggplot(aes(x = reorder(catalyst_key, -dropped))) +
geom_col(aes(y = added), fill = transcend_cols[1]) +
geom_col(aes(y = dropped), fill = transcend_cols[3]) +
geom_hline(yintercept = 0, linetype = 2) +
coord_flip() +
labs(title = "Catalyst Key Adds/Drops for \nSchools Answering Both Years",
y = "Drops (Negative) and Adds (Positive)",
x = "Catalyst")2024 only
First, am curious how the free response category responded. I generated a wordcloud of these for barriers below.
library(wordcloud)
library(tm)
responses <- barriers$barrier_other_text
# Create a text corpus
corpus <- Corpus(VectorSource(responses))
# Text preprocessing
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lower case
corpus <- tm_map(corpus, removePunctuation) # Remove punctuation
corpus <- tm_map(corpus, removeNumbers) # Remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # Remove stopwords
corpus <- tm_map(corpus, stripWhitespace) # Strip whitespace
# Create a document-term matrix
dtm <- TermDocumentMatrix(corpus)
# Convert the matrix to a dataframe
matrix <- as.matrix(dtm)
word_freqs <- sort(rowSums(matrix), decreasing=TRUE)
data <- data.frame(word=names(word_freqs), freq=word_freqs)
# Generate the wordcloud
set.seed(1234) # For reproducibility
wordcloud(words = data$word, freq = data$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))While we’re here, I’m also including a wordcloud for the catalysts for innovation here.
responses <- full$catalyst_other_text
# Create a text corpus
corpus <- Corpus(VectorSource(responses))
# Text preprocessing
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert to lower case
corpus <- tm_map(corpus, removePunctuation) # Remove punctuation
corpus <- tm_map(corpus, removeNumbers) # Remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # Remove stopwords
corpus <- tm_map(corpus, stripWhitespace) # Strip whitespace
# Create a document-term matrix
dtm <- TermDocumentMatrix(corpus)
# Convert the matrix to a dataframe
matrix <- as.matrix(dtm)
word_freqs <- sort(rowSums(matrix), decreasing=TRUE)
data <- data.frame(word=names(word_freqs), freq=word_freqs)
# Generate the wordcloud
set.seed(1234) # For reproducibility
wordcloud(words = data$word, freq = data$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))Back to barriers –
For the rest of the options, here is what leaders selected.
barriers <- barriers %>%
select(-contains("_other")) %>%
pivot_longer(cols = contains("barrier"),
names_to = "barrier",
values_to = "selected") %>%
filter(selected == 1) %>%
group_by(barrier) %>%
summarize(n = n())#nice labels -- thanks, janette!
barrier_labs <- variables %>%
select(starts_with("barrier"), -barrier_other_text) %>%
pivot_longer(cols = starts_with("barrier"),
names_to = "barrier",
values_to = "response") %>%
select(barrier) %>%
unique() %>%
mutate(label = case_when(
barrier == "barrier_local_funds" ~ "Availability of local funds",
barrier == "barrier_private_funds" ~ "Availability of private funding from foundations or donors",
barrier == "barrier_donations" ~ "Changes to in-kind donations",
barrier == "barrier_inflation" ~ "Inflation/increasing prices",
barrier == "barrier_enrollment" ~ "Changes in school enrollment",
barrier == "barrier_shortage" ~ "Staffing shortages",
barrier == "barrier_federal_funds" ~ "Expiration of federal relief funds",
barrier == "barrier_other" ~ "Some other reason"
))
barriers %>%
left_join(barrier_labs, by = "barrier") %>%
ggplot(aes(reorder(label, n), n)) +
geom_col(aes(fill = label)) +
scale_fill_manual(values = c(transcend_cols, transcend_cols2)) +
labs(title = "2024 Barriers to Sustainability",
x = "Barrier",
y = "Count") +
theme(legend.position = "none",
panel.grid.major.y = element_blank(),
axis.text.y = element_text(size = 11)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 95)) +
scale_x_discrete(labels = wrap_format(35)) +
geom_text(aes(label = n), nudge_y = 0.5, hjust = 0, color = transcend_grays[2], fontface = "bold", size = 5.5, family = "sans") +
coord_flip()Percentage
barriers %>%
left_join(barrier_labs, by = "barrier") %>%
mutate(pct = n/n_2024) %>%
ggplot(aes(reorder(label, pct), pct)) +
geom_col(aes(fill = label)) +
scale_fill_manual(values = c(transcend_cols, transcend_cols2)) +
labs(title = "2024 Barriers to Sustainability",
x = "Barrier",
y = "Percent") +
theme(legend.position = "none",
panel.grid.major.y = element_blank(),
axis.text.y = element_text(size = 11)) +
scale_y_continuous(label = scales::label_percent(), expand = c(0, 0), limits = c(0, .55)) +
scale_x_discrete(labels = wrap_format(35)) +
geom_text(aes(label = paste0(round(pct, 2)*100, "%")), nudge_y = 0.01, hjust = 0, color = transcend_grays[2], fontface = "bold", size = 5.5, family = "sans") +
coord_flip()Task 1. Modify wordcloud by using bigrams.
barriers <- full %>%
select(school_id, starts_with("barrier"))
library(wordcloud)
library(tm)
library(tidytext)
library(dplyr)
responses <- barriers$barrier_other_text
# Create a data frame from the responses
responses_df <- data.frame(text = responses, stringsAsFactors = FALSE)
# Text preprocessing using tidytext
tidy_responses <- responses_df %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!bigram %in% stop_words$word) %>%
count(bigram, sort = TRUE)
# Generate the wordcloud
set.seed(1234) # For reproducibility
wordcloud(words = tidy_responses$bigram, freq = tidy_responses$n, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))Okay, so, nothing really pops out.
library(tidytext)
library(dplyr)
library(ggplot2)
responses <- barriers$barrier_other_text
# Create a data frame from the responses
responses_df <- data.frame(text = responses, stringsAsFactors = FALSE)
# Text preprocessing and sentiment analysis using tidytext
tidy_responses <- responses_df %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
# Aggregate sentiment scores
sentiment_summary <- tidy_responses %>%
group_by(sentiment) %>%
summarise(count = sum(n)) %>%
ungroup()
# Plot the sentiment summary
ggplot(sentiment_summary, aes(x = sentiment, y = count, fill = sentiment)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(title = "Sentiment Analysis of Responses",
x = "Sentiment",
y = "Count of Words") +
theme_minimal()## # A tibble: 2 × 2
## sentiment count
## <chr> <int>
## 1 negative 5
## 2 positive 5
Nothing really here, either.
Task 2. Conduct correlation analysis with clusters instead of tags.
old_clusters <- import(here("data/clusters_through_2024.csv"))
joined_dat <- dictionary %>%
select(clean_labels, variable_name) %>%
rename(var = variable_name) %>%
right_join(., old_clusters) %>%
select(-var)catalysts <- full %>%
select(starts_with("catalyst_")) %>%
select(-starts_with("catalyst_key"), -contains("_other"))
pilots <- full %>%
select(school_id, starts_with("pilot_")) %>%
select(-contains("_other"))
# rename catalysts X
rename_catalyst_mapping <- setNames(
dictionary$clean_labels[dictionary$variable_name %in% colnames(catalysts)],
dictionary$variable_name[dictionary$variable_name %in% colnames(catalysts)]
)
# rename pilots
rename_pilot_mapping <- setNames(
dictionary$clean_labels[dictionary$variable_name %in% colnames(pilots)],
dictionary$variable_name[dictionary$variable_name %in% colnames(pilots)]
)
# Correlations between catalyst variables and reasons for innovating
catalysts <- full %>%
select(starts_with("catalyst_")) %>%
select(-starts_with("catalyst_key"), -contains("_other")) %>%
rename_with(~ rename_catalyst_mapping[.x], .cols = everything())
pilots <- full %>%
select(school_id, starts_with("pilot_")) %>%
select(-contains("_other")) %>%
rename_with(~ rename_pilot_mapping[.x], .cols = -school_id)
pilots_cluster <- pilots %>%
pivot_longer(cols = -school_id,
names_to = "clean_labels",
values_to = "selection") %>%
left_join(., joined_dat) %>%
group_by(school_id, cluster) %>%
summarise(cluster_sum = sum(selection, na.rm = TRUE), .groups = 'drop') %>%
pivot_wider(names_from = cluster, values_from = cluster_sum, values_fill = list(cluster_sum = 0)) %>%
select(-school_id)
catalysts_clean <- remove_zero_variance(catalysts)
pilots_clean <- remove_zero_variance(pilots_cluster) #just one removed
reasons_and_futures <- cor(catalysts_clean, pilots_clean)
rfdf <- data.frame(reasons_and_futures)corrplot::corrplot(reasons_and_futures, method = "color", tl.col = "black", tl.cex = 0.6, c1.cex = 0.3, number.cex = 0.2, diag = TRUE, cl.pos = "n")Task 3. Change labels and add Ns to bars.
cat_by_year %>%
mutate(catalyst_key = paste0("catalyst_", catalyst_key)) %>%
mutate(catalyst_key = factor(catalyst_key, levels = names(rename_catalyst_mapping), labels = rename_catalyst_mapping)) %>%
ggplot(aes(x = n, y = catalyst_key, fill = selected)) +
geom_col(position = position_dodge(preserve = "single"), width = 1) +
# geom_text(aes(label = n), position = position_dodge(preserve = "single"), width = 0.7, color = transcend_grays[2], fontface = "bold", size = 4.5, family = "sans") +
scale_fill_manual(values = transcend_cols) +
labs(title = "Catalyst Key Selection for Schools with Both Years of Data",
x = "Number of Schools Selecting",
y = "Catalyst",
legend = "Status") +
scale_y_discrete(labels = rename_catalyst_mapping) +
theme(
panel.grid.major.y = element_blank(),
legend.position = "bottom",
axis.text.y = element_text(size = rel(0.6))
)